# -*- coding: utf-8 -*-

import scrapy
from itemloaders.processors import TakeFirst, Compose, MapCompose
from w3lib.html import remove_tags


class WebPageItem(scrapy.Item):
    """Scrapy item holding the data fields recorded for one web page.

    Every attribute is a plain ``scrapy.Field()`` with no metadata, so no
    input/output processors are attached at the item level.
    """

    # --- Page-level fields ---
    track_id = scrapy.Field()  # Unique identifier
    url = scrapy.Field()  # Page URL
    category = scrapy.Field()  # Category
    publish_time = scrapy.Field()  # Publish time
    title = scrapy.Field()  # Page title
    main_body = scrapy.Field()  # Main body content
    # NOTE(review): `main_file` / `attachment_file` coexist with the plural
    # `main_files` / `attachment_files` below; how the pairs relate is not
    # visible in this file -- confirm against the spiders/pipelines.
    main_file = scrapy.Field()  # List of main files
    attachment_file = scrapy.Field()  # List of attachment files
    remark = scrapy.Field()  # Remarks

    # --- Media / file URL fields ---
    images = scrapy.Field()  # List of image URLs
    videos = scrapy.Field()  # List of video URLs
    audios = scrapy.Field()  # List of audio URLs
    # NOTE(review): both fields below were described identically as
    # "list of PDF URLs"; presumably one is for main documents and the other
    # for attachments -- confirm.
    main_files = scrapy.Field()  # List of PDF URLs
    attachment_files = scrapy.Field()  # List of PDF URLs


class MediaItem(scrapy.Item):
    """Scrapy item describing a single media file.

    Every attribute is a plain ``scrapy.Field()`` with no metadata.
    """

    url = scrapy.Field()  # Original URL
    sha256_hash = scrapy.Field()  # SHA256 hash value
    file_type = scrapy.Field()  # File type (image/video/audio/pdf)
    site_name = scrapy.Field()  # Site the file belongs to
    content = scrapy.Field()  # File content (bytes)
    filename = scrapy.Field()  # Local file name
    file_extension = scrapy.Field()  # File extension